library(here)
library(tidyverse)
library(ggpubr)
library(epitools)
pal2 <- c("#015b58", "#5962b5")
pal2.flip <- c("#5962b5", "#015b58")
here::here()
[1] "/Users/emma/Library/CloudStorage/OneDrive-SharedLibraries-IndianaUniversity/Lennon, Jay - 0000_Bueren/Projects/LifeStyle/PhageLifestyleSporulation"
box <- read.delim2(here("data/inphared_db/14Apr2025inph_0A_v3.txt" ))
box <- subset(box, box$Header!="Header")
box <- box %>% rename(product = Sequence, phage = Contig)
box[,c(2)] <- "0A_box"
#box <- box[,c(3,2,1)]
box$count <- 1
gc_content <- function(seq) {
seq <- toupper(seq)
bases <- strsplit(seq, "")[[1]]
gc_count <- sum(bases %in% c("G", "C"))
total <- length(bases)
return(gc_count / total)
}
# Vectorized version for multiple sequences
gc_content_vec <- function(seqs) {
sapply(seqs, gc_content)
}
box$GC.flanks <- gc_content_vec(box$Upstream)
box <- subset(box, box$GC.flanks<0.35)
box.all <- box
box <- box.all %>% group_by(phage, product) %>% summarise(sum=sum(count))
`summarise()` has grouped output by 'phage'. You can override using the `.groups` argument.
phage <- read.csv(here("data/inphared_db/14Apr2025_knownsporestatus.csv"), row.names=1)
all <- merge(box, phage, by.x="phage", by.y="Accession", all.x=TRUE, all.y=TRUE)
all$product[is.na(all$product)] <- "0A_box"
all$sum[is.na(all$sum)] <- 0
all[is.na(all)] <- "currently_missing"
all <- subset(all, all$newgtdb_Phylum!="currently_missing")
all$HostPhyla <- "Other"
all$HostPhyla <- ifelse(all$newgtdb_Phylum=="Bacillota", "Bacillota", "Other")
all$sporulation <- "unknown"
all$sporulation <- ifelse(all$f_spor=="TRUE", "Spor", all$sporulation)
all$sporulation <- ifelse(all$f_spor=="FALSE", "Nonspor", all$sporulation)
all <- subset(all, all$sporulation!="unknown" )
all <- subset(all, all$predicted_label!="missing")
all <- unite(all, "phyla_spor_type", c("HostPhyla", "sporulation", "lifestyle"), sep = "_", remove = FALSE, na.rm = FALSE)
all <- unite(all, "spor_type", c("sporulation", "lifestyle"), sep = "_", remove = FALSE, na.rm = FALSE)
all <- unite(all, "specphyla_spor_type", c("newgtdb_Phylum", "sporulation", "lifestyle"), sep = "_", remove = FALSE, na.rm = FALSE)
all$genome <- as.numeric(all$Genome.Length..bp.)
all$GC <- as.numeric(all$molGC....)
all$genome <- log10(all$genome)
all$tally <- 1
all$hit <- ifelse(all$sum!=0, 1, 0)
counts <- all %>% group_by(phyla_spor_type) %>% summarise(num.phage=sum(tally), total.0A=sum(sum), mean.0A=mean(sum), hits.0A=sum(hit))
### only accounting for sporulation and lifestyle
ggplot(all, aes(x = factor(spor_type), fill = factor(spor_type), y = sum )) +
geom_boxplot(binaxis = "y", stackdir = "center", position = "dodge") + geom_jitter(width = 0.2) + ylab("Number of AT-rich 0A boxes boxes") +theme(axis.text.x = element_text(angle = 50, vjust = 1, hjust = 1)) + theme(legend.position = "right") + xlab("Phyla_Spor_Lifestyle")
Warning: Ignoring unknown parameters: `binaxis` and `stackdir`
## accounting for bacilliota vs. other phyla
ggplot(all, aes(x = factor(phyla_spor_type), fill = factor(phyla_spor_type), y = sum )) +
geom_boxplot(binaxis = "y", stackdir = "center", position = "dodge") + geom_jitter(width = 0.2) + ylab("Number of AT-rich 0A boxes boxes")+theme(axis.text.x = element_text(angle = 50, vjust = 1, hjust = 1)) + theme(legend.position = "right", legend.title = element_blank()) + xlab("Sporulation Phenotype + Phage Lifestyle") + ggtitle(("# of 0A boxes in Bacillota vs. Other Phyla Phages")) +ylim(0,120)
Warning: Ignoring unknown parameters: `binaxis` and `stackdir`
ggsave(here("lab_pres/UpstreamFilt35_Inph0ACounts.png"))
Saving 7.29 x 4.51 in image
## linear reg
ggplot(all, aes(genome,sum, fill=phyla_spor_type, colour = phyla_spor_type)) +
geom_point() +
geom_smooth(method="lm")+ # Add regression line
stat_regline_equation(aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~~")),
label.x.npc = "left", label.y.npc = "top" ) + ylab("Number of AT-Rich Flanked 0A boxes") +xlab("log10(Phage Genome Size, bp)")# Add
ggsave(here("lab_pres/UpstreamFilt35_Inph0AReg.png"))
Saving 7.29 x 4.51 in image
ggplot(all, aes(GC,sum, fill=phyla_spor_type, colour = phyla_spor_type)) +
geom_point() +
geom_smooth(method="lm")+ # Add regression line
stat_regline_equation(aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~~")),
label.x.npc = "left", label.y.npc = "top" ) + ylab("Number of AT-rich 0A boxes boxes") +xlab("log10(Phage Genome Size, bp)")# Add
ggsave(here("lab_pres/UpstreamFilt35_Inph0AReg-GC.png"))
Saving 7.29 x 4.51 in image
ggplot(all, aes(x = factor(newgtdb_Phylum), fill = factor(predicted_label), y = sum )) + geom_boxplot(binaxis = “y”, stackdir = “center”, position = “dodge”) + geom_jitter(width = 0.2) + ylab(“Number of AT-rich 0A boxes boxes”)
ggplot(all, aes(x = factor(specphyla_spor_type), fill = factor(predicted_label), y = sum )) + geom_boxplot(binaxis = “y”, stackdir = “center”, position = “dodge”) + geom_jitter(width = 0.2) + ylab(“Number of AT-rich 0A boxes boxes”)
ggplot(all, aes(x = factor(phyla_spor_type), fill = factor(phyla_spor_type), y = sum )) + geom_boxplot(binaxis = “y”, stackdir = “center”, position = “dodge”) + geom_jitter(width = 0.2) + ylab(“Number of AT-rich 0A boxes boxes”) +theme(axis.text.x = element_text(angle = 50, vjust = 1, hjust = 1)) + theme(legend.position = “left”) + xlab(“Phyla_Spor_Lifestyle”)
ggplot(all, aes(x = factor(specphyla_spor_type), fill = factor(specphyla_spor_type), y = sum )) + geom_boxplot(binaxis = “y”, stackdir = “center”, position = “dodge”) + geom_jitter(width = 0.2) + ylab(“Number of AT-rich 0A boxes boxes”)
ggplot(all, aes(Genome.Length..bp.,sum, fill=specphyla_spor_type, colour = specphyla_spor_type)) + geom_point() + geom_smooth(method=“lm”)+ # Add regression line stat_regline_equation(aes(label = paste(..eq.label.., ..rr.label.., sep = “~~~~”)), label.x.npc = “middle”, label.y.npc = “top” )# Add equation#+ ylim(0, 25) #+ xlim(0,
a1 <- aov(sum ~ phyla_spor_type, data = all)
summary(a1)
Df Sum Sq Mean Sq F value Pr(>F)
phyla_spor_type 5 16889 3378 515.4 <2e-16 ***
Residuals 20516 134462 7
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
TukeyHSD(a1)
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = sum ~ phyla_spor_type, data = all)
$phyla_spor_type
diff lwr upr p adj
Bacillota_Nonspor_Temp-Bacillota_Nonspor_Lytic 0.78755300 0.49493759 1.0801684 0.0000000
Bacillota_Spor_Lytic-Bacillota_Nonspor_Lytic 3.28620710 2.84163788 3.7307763 0.0000000
Bacillota_Spor_Temp-Bacillota_Nonspor_Lytic 0.76350198 0.35633808 1.1706659 0.0000013
Other_Nonspor_Lytic-Bacillota_Nonspor_Lytic 0.91293550 0.68075479 1.1451162 0.0000000
Other_Nonspor_Temp-Bacillota_Nonspor_Lytic -0.83996481 -1.07625541 -0.6036742 0.0000000
Bacillota_Spor_Lytic-Bacillota_Nonspor_Temp 2.49865410 2.06713149 2.9301767 0.0000000
Bacillota_Spor_Temp-Bacillota_Nonspor_Temp -0.02405102 -0.41692812 0.3688261 0.9999777
Other_Nonspor_Lytic-Bacillota_Nonspor_Temp 0.12538250 -0.08071906 0.3314841 0.5092498
Other_Nonspor_Temp-Bacillota_Nonspor_Temp -1.62751781 -1.83823852 -1.4167971 0.0000000
Bacillota_Spor_Temp-Bacillota_Spor_Lytic -2.52270512 -3.03881598 -2.0065943 0.0000000
Other_Nonspor_Lytic-Bacillota_Spor_Lytic -2.37327160 -2.76633122 -1.9802120 0.0000000
Other_Nonspor_Temp-Bacillota_Spor_Lytic -4.12617191 -4.52167314 -3.7306707 0.0000000
Other_Nonspor_Lytic-Bacillota_Spor_Temp 0.14943352 -0.20076145 0.4996285 0.8291778
Other_Nonspor_Temp-Bacillota_Spor_Temp -1.60346679 -1.95640004 -1.2505335 0.0000000
Other_Nonspor_Temp-Other_Nonspor_Lytic -1.75290031 -1.86553832 -1.6402623 0.0000000
all.spor.counts <- all %>% group_by(sporulation) %>% summarise(num.phage=sum(tally), total.0A=sum(sum), mean.0A=mean(sum), hits.0A=sum(hit))
all.spor.counts$perc.hit <- all.spor.counts$hits.0A/all.spor.counts$num.phage
all.spor.counts
spor.stat <- c('Spore', 'Non-spore')
outcome <- c('Hits', 'None')
all.hits <- matrix(c(736, 75, 18825, 887), nrow=2, ncol=2, byrow=TRUE)
dimnames(all.hits) <- list('Spore'=spor.stat, 'Outcome'=outcome)
all.hits
Outcome
Spore Hits None
Spore 736 75
Non-spore 18825 887
fisher.test(all.hits)
Fisher's Exact Test for Count Data
data: all.hits
p-value = 2.102e-08
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.3604635 0.6002062
sample estimates:
odds ratio
0.4624162
oddsratio(all.hits)
$data
Outcome
Spore Hits None Total
Spore 736 75 811
Non-spore 18825 887 19712
Total 19561 962 20523
$measure
odds ratio with 95% C.I.
Spore estimate lower upper
Spore 1.0000000 NA NA
Non-spore 0.4615714 0.3629677 0.5953388
$p.value
two-sided
Spore midp.exact fisher.exact chi.square
Spore NA NA NA
Non-spore 2.090057e-08 2.101833e-08 3.623752e-10
$correction
[1] FALSE
attr(,"method")
[1] "median-unbiased estimate & mid-p exact CI"
other phyla lifestyle
phyla <- subset(all, all$HostPhyla=="OtherPhyla")
phyla.life <- phyla %>% group_by(predicted_label) %>% summarise(num.phage=sum(tally), total.0A=sum(sum), mean.0A=mean(sum), hits.0A=sum(hit))
phyla.life
NA
phyla.stat <- c('Lytic', 'Temp')
outcome <- c('Hits', 'None')
phyla.hits <- matrix(c(7047, 239, 9544, 350), nrow=2, ncol=2, byrow=TRUE)
dimnames(phyla.hits) <- list('Lifestyle'=phyla.stat, 'Outcome'=outcome)
phyla.hits
Outcome
Lifestyle Hits None
Lytic 7047 239
Temp 9544 350
fisher.test(phyla.hits)
Fisher's Exact Test for Count Data
data: phyla.hits
p-value = 0.3731
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.9119789 1.2837726
sample estimates:
odds ratio
1.08131
oddsratio(phyla.hits)
$data
Outcome
Lifestyle Hits None Total
Lytic 7047 239 7286
Temp 9544 350 9894
Total 16591 589 17180
$measure
odds ratio with 95% C.I.
Lifestyle estimate lower upper
Lytic 1.000000 NA NA
Temp 1.081059 0.9150867 1.279243
$p.value
two-sided
Lifestyle midp.exact fisher.exact chi.square
Lytic NA NA NA
Temp 0.36041 0.3731368 0.3597985
$correction
[1] FALSE
attr(,"method")
[1] "median-unbiased estimate & mid-p exact CI"
baci <- subset(all, all$HostPhyla=="Bacillota")
ggplot(baci, aes(x = factor(spor_type), fill = factor(spor_type), y = sum )) +
geom_boxplot(binaxis = "y", stackdir = "center", position = "dodge") + geom_jitter(width = 0.2) + ylab("Number of AT-rich 0A boxes boxes")+theme(axis.text.x = element_text(angle = 50, vjust = 1, hjust = 1)) + theme(legend.position = "bottom", legend.title = element_blank()) + xlab("Bacillota Sporulation Phenotype + Phage Lifestyle") + ggtitle(("# of 0A boxes in Bacillota Phages"))
Warning: Ignoring unknown parameters: `binaxis` and `stackdir`
ggsave(here("lab_pres/UpstreamFilt35_Baci0ACounts.png"))
Saving 7.29 x 4.51 in image
ggplot(baci, aes(genome,sum, fill=phyla_spor_type, colour = phyla_spor_type)) +
geom_point() +
geom_smooth(method="lm")+ # Add regression line
stat_regline_equation(aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~~")),
label.x.npc = "left", label.y.npc = "top" ) + ylab("Number of AT-rich 0A boxes boxes") +xlab("log10(Phage Genome Size, bp)")# Add
ggsave(here("lab_pres/UpstreamFilt35_Baci0AReg.png"))
Saving 7.29 x 4.51 in image
spor.counts <- baci %>% group_by(sporulation) %>% summarise(num.phage=sum(tally), total.0A=sum(sum), mean.0A=mean(sum), hits.0A=sum(hit))
spor.counts$perc.hit <- spor.counts$hits.0A/spor.counts$num.phage
spor.counts
NA
spor.stat <- c('Spore', 'Non-spore')
outcome <- c('Hits', 'None')
bacil.hits <- matrix(c(736, 75, 2234, 298), nrow=2, ncol=2, byrow=TRUE)
dimnames(bacil.hits) <- list('Spore'=spor.stat, 'Outcome'=outcome)
bacil.hits
Outcome
Spore Hits None
Spore 736 75
Non-spore 2234 298
library(epitools)
fisher.test(bacil.hits)
Fisher's Exact Test for Count Data
data: bacil.hits
p-value = 0.04714
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.9982116 1.7329346
sample estimates:
odds ratio
1.308888
oddsratio(bacil.hits)
$data
Outcome
Spore Hits None Total
Spore 736 75 811
Non-spore 2234 298 2532
Total 2970 373 3343
$measure
odds ratio with 95% C.I.
Spore estimate lower upper
Spore 1.000000 NA NA
Non-spore 1.306972 1.006351 1.71714
$p.value
two-sided
Spore midp.exact fisher.exact chi.square
Spore NA NA NA
Non-spore 0.04460157 0.04713798 0.04715564
$correction
[1] FALSE
attr(,"method")
[1] "median-unbiased estimate & mid-p exact CI"
virbac <- subset(baci, baci$predicted_label=="virulent")
life.counts <- virbac %>% group_by(sporulation) %>% summarise(num.phage=sum(tally), total.0A=sum(sum), mean.0A=mean(sum), hits.0A=sum(hit))
life.counts$perc.hit <- life.counts$hits.0A/life.counts$num.phage
life.counts
spor.stat <- c('Spore', 'Non-spore')
outcome <- c('Hits', 'None')
vir.hits <- matrix(c(338, 19, 1097, 246), nrow=2, ncol=2, byrow=TRUE)
dimnames(vir.hits) <- list('Spore'=spor.stat, 'Outcome'=outcome)
vir.hits
Outcome
Spore Hits None
Spore 338 19
Non-spore 1097 246
library(epitools)
fisher.test(vir.hits)
Fisher's Exact Test for Count Data
data: vir.hits
p-value = 7.378e-11
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
2.451394 6.843502
sample estimates:
odds ratio
3.986701
oddsratio(vir.hits)
$data
Outcome
Spore Hits None Total
Spore 338 19 357
Non-spore 1097 246 1343
Total 1435 265 1700
$measure
odds ratio with 95% C.I.
Spore estimate lower upper
Spore 1.000000 NA NA
Non-spore 3.957262 2.506967 6.626532
$p.value
two-sided
Spore midp.exact fisher.exact chi.square
Spore NA NA NA
Non-spore 4.492851e-11 7.377785e-11 1.784964e-09
$correction
[1] FALSE
attr(,"method")
[1] "median-unbiased estimate & mid-p exact CI"
tempbac <- subset(baci, baci$predicted_label=="temperate")
tlife.counts <- tempbac %>% group_by(sporulation) %>% summarise(num.phage=sum(tally), total.0A=sum(sum), mean.0A=mean(sum), hits.0A=sum(hit))
tlife.counts$perc.hit <- tlife.counts$hits.0A/tlife.counts$num.phage
tlife.counts
spor.stat <- c('Spore', 'Non-spore')
outcome <- c('Hits', 'None')
temp.hits <- matrix(c(398, 56, 1383, 52), nrow=2, ncol=2, byrow=TRUE)
dimnames(temp.hits) <- list('Spore'=spor.stat, 'Outcome'=outcome)
temp.hits
Outcome
Spore Hits None
Spore 398 56
Non-spore 1383 52
library(epitools)
fisher.test(temp.hits)
Fisher's Exact Test for Count Data
data: temp.hits
p-value = 1.385e-10
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.1766936 0.4042270
sample estimates:
odds ratio
0.2674352
oddsratio(temp.hits)
$data
Outcome
Spore Hits None Total
Spore 398 56 454
Non-spore 1383 52 1435
Total 1781 108 1889
$measure
odds ratio with 95% C.I.
Spore estimate lower upper
Spore 1.0000000 NA NA
Non-spore 0.2674937 0.179992 0.3969365
$p.value
two-sided
Spore midp.exact fisher.exact chi.square
Spore NA NA NA
Non-spore 1.32748e-10 1.385072e-10 3.217975e-12
$correction
[1] FALSE
attr(,"method")
[1] "median-unbiased estimate & mid-p exact CI"
other phyla lifestyle
sporbac <- subset(baci, baci$sporulation=="Spor")
spor.life <- sporbac %>% group_by(predicted_label) %>% summarise(num.phage=sum(tally), total.0A=sum(sum), mean.0A=mean(sum), hits.0A=sum(hit))
spor.life
NA
## this is spor temp vs. spor lytic of bacil
spor.stat <- c('Spore', 'Non-spore')
outcome <- c('Hits', 'None')
temp.hits <- matrix(c(357, 23, 454, 63), nrow=2, ncol=2, byrow=TRUE)
dimnames(temp.hits) <- list('Spore'=spor.stat, 'Outcome'=outcome)
temp.hits
Outcome
Spore Hits None
Spore 357 23
Non-spore 454 63
library(epitools)
fisher.test(temp.hits)
Fisher's Exact Test for Count Data
data: temp.hits
p-value = 0.001899
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
1.286520 3.711636
sample estimates:
odds ratio
2.152109
oddsratio(temp.hits)
$data
Outcome
Spore Hits None Total
Spore 357 23 380
Non-spore 454 63 517
Total 811 86 897
$measure
odds ratio with 95% C.I.
Spore estimate lower upper
Spore 1.000000 NA NA
Non-spore 2.143206 1.320225 3.597265
$p.value
two-sided
Spore midp.exact fisher.exact chi.square
Spore NA NA NA
Non-spore 0.001767875 0.001898591 0.002050377
$correction
[1] FALSE
attr(,"method")
[1] "median-unbiased estimate & mid-p exact CI"